{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "92c59c8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "import os\n",
    "import re\n",
    "import jieba\n",
    "import codecs\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "4b2e164b",
   "metadata": {},
   "outputs": [],
   "source": [
    "#停用词典过滤\n",
    "stopwords = set()\n",
    "fr = codecs.open('D:\\Davion\\停用词.txt','r',encoding='utf-8')\n",
    "for word in fr:\n",
    "\tstopwords.add(word.strip()) #Python strip() 方法用于移除字符串头尾指定的字符（默认为空格或换行符）或字符序列。\n",
    "#读取否定词文件\n",
    "not_word_file = codecs.open('D:\\Davion\\否定词.txt','r+',encoding='utf-8')\n",
    "not_word_list = not_word_file.readlines()\n",
    "not_word_list = [w.strip() for w in not_word_list]\n",
    "#读取程度副词文件\n",
    "degree_file = codecs.open('D:\\Davion\\程度副词.txt','r+',encoding='utf-8')\n",
    "degree_list = degree_file.readlines()\n",
    "degree_list = [item.split(',')[0] for item in degree_list]\n",
    "#生成新的停用词表\t\t\t\n",
    "with codecs.open('D:\\Davion\\stopwords.txt','w',encoding='utf-8') as f:\n",
    "\tfor word in stopwords:\n",
    "\t\tif(word not in not_word_list) and (word not in degree_list):\n",
    "\t\t\tf.write(word+'\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "e5b581e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "#jieba分词后去除停用词\n",
    "def seg_word(sentence):\n",
    "\tseg_list = jieba.cut(sentence)\n",
    "\tseg_result = []\n",
    "\tfor i in seg_list:\n",
    "\t\tseg_result.append(i)\n",
    "        #读取停用词文件\n",
    "\tstopwords = set()\n",
    "\twith codecs.open('D:\\Davion\\stopwords.txt','r',encoding='utf-8') as fr:\n",
    "\t\tfor i in fr:\n",
    "\t\t\tstopwords.add(i.strip())\n",
    "        #去除停用词\n",
    "\treturn list(filter(lambda x :x not in stopwords,seg_result))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "ca3b8400",
   "metadata": {},
   "outputs": [],
   "source": [
    "#找出文本中的情感词、否定词和程度副词\n",
    "def classify_words(word_list):\n",
    "\t#读取情感词典文件\n",
    "\tsen_file = codecs.open('D:\\Davion\\Jiang20Yao21_media_sentiment_score.txt','r+',encoding='utf-8')\n",
    "\t#获取词典文件内容\n",
    "\tsen_list = sen_file.readlines()\n",
    "\t#创建情感字典\n",
    "\tsen_dict = defaultdict()\n",
    "\t#读取词典每一行的内容，将其转换成字典对象，key为情感词，value为其对应的权重\n",
    "\tfor i in sen_list:\n",
    "\t\tif len(i.split(' '))==2:\n",
    "\t\t\tsen_dict[i.split(' ')[0]] = i.split(' ')[1]\n",
    " \n",
    "\t#读取否定词文件\n",
    "\tnot_word_file = codecs.open('D:\\Davion\\否定词.txt','r+',encoding='utf-8')\n",
    "\tnot_word_list = not_word_file.readlines()\n",
    "\t#读取程度副词文件\n",
    "\tdegree_file = codecs.open('D:\\Davion\\程度副词.txt','r+',encoding='utf-8')\n",
    "\tdegree_list = degree_file.readlines()\n",
    "\tdegree_dict = defaultdict()\n",
    "\tfor i in degree_list:\n",
    "\t\tdegree_dict[i.split(',')[0]] = i.split(',')[1]\n",
    " \n",
    "\tsen_word = dict()\n",
    "\tnot_word = dict()\n",
    "\tdegree_word = dict()\n",
    "\t#分类\n",
    "\tfor i in range(len(word_list)):\n",
    "\t\tword = word_list[i]\n",
    "\t\tif word in sen_dict.keys() and word not in not_word_list and word not in degree_dict.keys():\n",
    "\t\t\t#找出分词结果中在情感字典中的词\n",
    "\t\t\tsen_word[i] = sen_dict[word]\n",
    "\t\telif word in not_word_list and word not in degree_dict.keys():\n",
    "\t\t\t#分词结果中在否定词列表中的词\n",
    "\t\t\tnot_word[i] = -1\n",
    "\t\telif word in degree_dict.keys():\n",
    "\t\t\t#分词结果中在程度副词中的词\n",
    "\t\t\tdegree_word[i]  = degree_dict[word]\n",
    " \n",
    " \n",
    "\t#关闭打开的文件\n",
    "\tsen_file.close()\n",
    "\tnot_word_file.close()\n",
    "\tdegree_file.close()\n",
    "\t#返回分类结果\n",
    "\treturn sen_word,not_word,degree_word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "423a3038",
   "metadata": {},
   "outputs": [],
   "source": [
    "#计算情感词的分数\n",
    "def score_sentiment(sen_word,not_word,degree_word,seg_result):\n",
    "\t#权重初始化为1\n",
    "\tW = 1\n",
    "\tscore = 0\n",
    "\t#情感词下标初始化\n",
    "\tsentiment_index = -1\n",
    "\t#情感词的位置下标集合\n",
    "\tsentiment_index_list = list(sen_word.keys())\n",
    "\t#遍历分词结果\n",
    "\tfor i in range(0,len(seg_result)):\n",
    "\t\t#如果是情感词\n",
    "\t\tif i in sen_word.keys():\n",
    "\t\t\t#权重*情感词得分\n",
    "\t\t\tscore += W*float(sen_word[i])\n",
    "\t\t\t#情感词下标加一，获取下一个情感词的位置\n",
    "\t\t\tsentiment_index += 1\n",
    "\t\t\tif sentiment_index < len(sentiment_index_list)-1:\n",
    "\t\t\t\t#判断当前的情感词与下一个情感词之间是否有程度副词或否定词\n",
    "\t\t\t\tfor j in range(sentiment_index_list[sentiment_index],sentiment_index_list[sentiment_index+1]):\n",
    "\t\t\t\t\t#更新权重，如果有否定词，权重取反\n",
    "\t\t\t\t\tif j in not_word.keys():\n",
    "\t\t\t\t\t\tW *= -1\n",
    "\t\t\t\t\telif j in degree_word.keys():\n",
    "\t\t\t\t\t\tW *= float(degree_word[j])\t\n",
    "\t\t#定位到下一个情感词\n",
    "\t\tif sentiment_index < len(sentiment_index_list)-1:\n",
    "\t\t\ti = sentiment_index_list[sentiment_index+1]\n",
    "\treturn score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "22f8ba09",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>comment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>这是有啥不好的消息要出来吗？垃圾的不能再垃圾了</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>破火车的钱都是高利贷 被新都酒店搞的破产了 陆虎别也抵押了&lt;br&gt;日前，一名男子走到华山栈道...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>没鸡八事，去他马币的，不看了</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>退市价格3元两大杯</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>价值归零的股票，居然有这么多的脑残王八在买入，这些王八不下地狱谁下地狱呢？哈哈哈哈哈</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             comment\n",
       "0                            这是有啥不好的消息要出来吗？垃圾的不能再垃圾了\n",
       "1  破火车的钱都是高利贷 被新都酒店搞的破产了 陆虎别也抵押了<br>日前，一名男子走到华山栈道...\n",
       "2                                     没鸡八事，去他马币的，不看了\n",
       "3                                          退市价格3元两大杯\n",
       "4         价值归零的股票，居然有这么多的脑残王八在买入，这些王八不下地狱谁下地狱呢？哈哈哈哈哈"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#读取待分析文本数据\n",
    "data = pd.read_excel('D:\\Davion\\data.xlsx').astype(str)\n",
    "sentence = data.comment\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "7ce3493b",
   "metadata": {},
   "outputs": [],
   "source": [
    "#计算得分\n",
    "def sentiment_score(sentence):\n",
    "\t#1.对文档分词\n",
    "\tseg_list = seg_word(sentence)\n",
    "\t#2.将分词结果转换成字典，找出情感词、否定词和程度副词\n",
    "\tsen_word,not_word,degree_word = classify_words(seg_list)\n",
    "\t#3.计算得分\n",
    "\tscore = score_sentiment(sen_word,not_word,degree_word,seg_list)\n",
    "\treturn score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "62dc529f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\Davion\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 0.524 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    }
   ],
   "source": [
    "#导出分词和情感分值\n",
    "data['cut_comment'] = sentence.transform(seg_word)\n",
    "data['sentiment_score'] = sentence.transform(sentiment_score)\n",
    "data.to_excel(\"D:\\Davion\\data_result.xlsx\",index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db0582a0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
